library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)AAQoL machine learning analysis with unbalanced random forest
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above")))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Demographics
ps(Ethnicity)# A tibble: 7 × 3
Ethnicity n pct
<fct> <int> <dbl>
1 Chinese 639 24.5
2 Asian Indian 574 22.0
3 Filipino 265 10.2
4 Korean 471 18.1
5 Other 144 5.52
6 Vietnamese 514 19.7
7 <NA> 2 0.0767
ps(Gender)# A tibble: 3 × 3
Gender n pct
<fct> <int> <dbl>
1 Female 1425 54.6
2 Male 1157 44.3
3 <NA> 27 1.03
ps(Religion)# A tibble: 8 × 3
Religion n pct
<fct> <int> <dbl>
1 Buddhist 350 13.4
2 Catholic 492 18.9
3 Hindu 479 18.4
4 Muslim 68 2.61
5 None 506 19.4
6 Other 47 1.80
7 Protestant 645 24.7
8 <NA> 22 0.843
ps(`Full Time Employment`)# A tibble: 3 × 3
`Full Time Employment` n pct
<fct> <int> <dbl>
1 0 1458 55.9
2 Employed full time 1144 43.8
3 <NA> 7 0.268
ps(Income)# A tibble: 9 × 3
Income n pct
<fct> <int> <dbl>
1 $0 - $9,999 254 9.74
2 $10,000 - $19,999 205 7.86
3 $20,000 - $29,999 198 7.59
4 $30,000 - $39,999 207 7.93
5 $40,000 - $49,999 181 6.94
6 $50,000 - $59,999 178 6.82
7 $60,000 - $69,999 190 7.28
8 $70,000 and over 993 38.1
9 <NA> 203 7.78
ps(`English Speaking`)# A tibble: 5 × 3
`English Speaking` n pct
<fct> <int> <dbl>
1 Not at all 177 6.78
2 Not well 632 24.2
3 Very well 974 37.3
4 Well 808 31.0
5 <NA> 18 0.690
ps(`English Difficulties`)# A tibble: 5 × 3
`English Difficulties` n pct
<fct> <int> <dbl>
1 Not at all 772 29.6
2 Much 549 21.0
3 Not much 733 28.1
4 Very much 516 19.8
5 <NA> 39 1.49
ps(Discrimination)# A tibble: 3 × 3
Discrimination n pct
<dbl> <int> <dbl>
1 0 1598 61.2
2 1 694 26.6
3 NA 317 12.2
qol |> summarize(age_mean = mean(Age,na.rm=T),
age_sd = sd(Age,na.rm=T),
age_min = min(Age,na.rm=T),
age_max = max(Age,na.rm=T))# A tibble: 1 × 4
age_mean age_sd age_min age_max
<dbl> <dbl> <dbl> <dbl>
1 42.9 17.1 18 98
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
# filter(!is.na(Family)) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc",method="brf")
print(rfobj) Sample size: 1926
Frequency of class labels: 928, 998
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 528.6857
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swr
Resample size used to grow trees: 1856
Analysis: RF-C
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0754
(OOB) Brier score: 0.23092348
(OOB) Normalized Brier score: 0.92369391
(OOB) AUC: 0.65174638
(OOB) PR-AUC: 0.61532647
(OOB) G-mean: 0.60732292
(OOB) Requested performance error: 0.39267708
Confusion matrix:
predicted
observed No Yes class.error
No 610 318 0.3427
Yes 438 560 0.4389
(OOB) Misclassification rate: 0.3925234
print(rfobj) Sample size: 1926
Frequency of class labels: 928, 998
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 528.6857
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swr
Resample size used to grow trees: 1856
Analysis: RF-C
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0754
(OOB) Brier score: 0.23092348
(OOB) Normalized Brier score: 0.92369391
(OOB) AUC: 0.65174638
(OOB) PR-AUC: 0.61532647
(OOB) G-mean: 0.60732292
(OOB) Requested performance error: 0.39267708
Confusion matrix:
predicted
observed No Yes class.error
No 610 318 0.3427
Yes 438 560 0.4389
(OOB) Misclassification rate: 0.3925234
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0314 NA NA
EnglishDiff 0.0159 NA NA
Ethnicity 0.0156 NA NA
Trust 0.0092 NA NA
Employment 0.0089 NA NA
Helpful.Family 0.0062 NA NA
Discrimination 0.0059 NA NA
Income_median 0.0059 NA NA
EnglishSpeak 0.0055 NA NA
Loyalty 0.0051 NA NA
Religion 0.0049 NA NA
See.Family 0.0048 NA NA
Dental.Insurance 0.0047 NA NA
Close.Family 0.0046 NA NA
Togetherness 0.0044 NA NA
Feel.Close 0.0040 NA NA
Successful.Family 0.0036 NA NA
Community.Shares.Values 0.0035 NA NA
Religious.Attendance 0.0035 NA NA
Expression 0.0035 NA NA
Community.Trust 0.0034 NA NA
Helpful.Community 0.0030 NA NA
Similar.Values 0.0029 NA NA
Spend.Time.Together 0.0012 NA NA
Religious.Importance 0.0007 NA NA
Close.Friends 0.0000 NA NA
rfobj$importance all No Yes
Ethnicity 1.561676e-02 NA NA
Age 3.136262e-02 NA NA
Gender -1.289370e-03 NA NA
Religion 4.869572e-03 NA NA
Employment 8.869707e-03 NA NA
Income_median 5.881249e-03 NA NA
EnglishSpeak 5.466686e-03 NA NA
EnglishDiff 1.589785e-02 NA NA
See.Family 4.815807e-03 NA NA
Close.Family 4.648270e-03 NA NA
Helpful.Family 6.200891e-03 NA NA
See.Friends -2.383014e-03 NA NA
Close.Friends -3.022316e-05 NA NA
Helpful.Friends -4.584730e-03 NA NA
Family.Respect -6.645626e-04 NA NA
Similar.Values 2.851297e-03 NA NA
Successful.Family 3.594840e-03 NA NA
Trust 9.233369e-03 NA NA
Loyalty 5.073923e-03 NA NA
Family.Pride -1.741601e-03 NA NA
Expression 3.494694e-03 NA NA
Spend.Time.Together 1.190567e-03 NA NA
Feel.Close 3.958902e-03 NA NA
Togetherness 4.427049e-03 NA NA
Religious.Attendance 3.534035e-03 NA NA
Religious.Importance 7.409379e-04 NA NA
Close.knit.Community -1.842641e-03 NA NA
Helpful.Community 2.955803e-03 NA NA
Community.Shares.Values 3.534035e-03 NA NA
Get.Along -1.307111e-03 NA NA
Community.Trust 3.355233e-03 NA NA
Health.Insurance -8.533212e-05 NA NA
Dental.Insurance 4.749503e-03 NA NA
Discrimination 5.888430e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100) |>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
Cross validation in Random Forests (Run when you have time)
# myTrainingControl <- trainControl(method = "repeatedcv",
# number = 10,
# repeats = 3,
# savePredictions = TRUE,
# classProbs = TRUE,
# verboseIter = TRUE,
# search = "grid")
#
#
# set.seed(123)
#
# model_rf <- train(Family~ .,
# data=rfdata,
# method = 'rf',
# metric = "Accuracy",
# trControl = myTrainingControl,
# importance = TRUE
# )
#
# varImp(model_rf)Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
#
# rfsrc(Family~.,data=train, importance="permute",
# perf.type="gmean",
# splitrule="auc",
# block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 335.866
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.18674628
(OOB) Normalized Brier score: 0.74698513
(OOB) AUC: 0.85467195
(OOB) PR-AUC: 0.85195658
(OOB) G-mean: 0.77096032
(OOB) Requested performance error: 0.22903968
Confusion matrix:
predicted
observed Yes No class.error
Yes 579 177 0.2341
No 176 610 0.2239
(OOB) Misclassification rate: 0.2289235
plot(rfobj,plots.one.page = FALSE)

all Yes No
Ethnicity 0.0247 NA NA
Close.Family 0.0186 NA NA
EnglishDiff 0.0182 NA NA
Discrimination 0.0167 NA NA
Close.Friends 0.0135 NA NA
Religious.Attendance 0.0131 NA NA
Expression 0.0129 NA NA
Employment 0.0122 NA NA
Helpful.Friends 0.0112 NA NA
Successful.Family 0.0111 NA NA
Family.Respect 0.0078 NA NA
Religion 0.0078 NA NA
Age 0.0078 NA NA
Religious.Importance 0.0077 NA NA
Community.Shares.Values 0.0070 NA NA
Community.Trust 0.0069 NA NA
See.Family 0.0069 NA NA
Feel.Close 0.0064 NA NA
EnglishSpeak 0.0058 NA NA
Close.knit.Community 0.0050 NA NA
Income_median 0.0045 NA NA
Gender 0.0041 NA NA
Get.Along 0.0039 NA NA
Similar.Values 0.0037 NA NA
Togetherness 0.0033 NA NA
Dental.Insurance 0.0025 NA NA
rfobj$importance all Yes No
Ethnicity 0.024656644 NA NA
Age 0.007786248 NA NA
Gender 0.004090596 NA NA
Religion 0.007786248 NA NA
Employment 0.012172708 NA NA
Income_median 0.004463751 NA NA
EnglishSpeak 0.005822910 NA NA
EnglishDiff 0.018241775 NA NA
See.Family 0.006882684 NA NA
Close.Family 0.018638725 NA NA
Helpful.Family 0.002477147 NA NA
See.Friends 0.001179641 NA NA
Close.Friends 0.013519038 NA NA
Helpful.Friends 0.011207441 NA NA
Family.Respect 0.007786248 NA NA
Similar.Values 0.003708850 NA NA
Successful.Family 0.011084516 NA NA
Trust 0.001841249 NA NA
Loyalty 0.001179641 NA NA
Family.Pride 0.001841249 NA NA
Expression 0.012857228 NA NA
Spend.Time.Together 0.001205876 NA NA
Feel.Close 0.006424699 NA NA
Togetherness 0.003297697 NA NA
Religious.Attendance 0.013090351 NA NA
Religious.Importance 0.007693639 NA NA
Close.knit.Community 0.005047992 NA NA
Helpful.Community 0.001369863 NA NA
Community.Shares.Values 0.007031368 NA NA
Get.Along 0.003860209 NA NA
Community.Trust 0.006943256 NA NA
Health.Insurance 0.001929871 NA NA
Dental.Insurance 0.002477147 NA NA
Discrimination 0.016650486 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="family_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
199.0000000 185.0000000 1.0756757 0.4817708 0.5567568 0.5326633
prec npv misclass brier brier.norm auc
0.5255102 0.5638298 0.4557292 0.2425106 0.9700425 0.5905745
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5406824 0.5442197 0.4817708 0.5710029 0.5426296 0.5443983
gmean
0.5445768
test_rf$importance all Yes No
Ethnicity 8.324236e-03 NA NA
Age 2.019169e-02 NA NA
Gender 1.022131e-03 NA NA
Religion 3.149065e-03 NA NA
Employment -3.321338e-04 NA NA
Income_median 4.295393e-04 NA NA
EnglishSpeak 2.228164e-03 NA NA
EnglishDiff 1.568637e-03 NA NA
See.Family -1.910117e-03 NA NA
Close.Family -1.940504e-03 NA NA
Helpful.Family 6.466441e-03 NA NA
See.Friends 5.652327e-03 NA NA
Close.Friends 2.840038e-04 NA NA
Helpful.Friends -5.760976e-04 NA NA
Family.Respect -2.849889e-04 NA NA
Similar.Values -1.034093e-03 NA NA
Successful.Family 3.978352e-05 NA NA
Trust -2.123316e-03 NA NA
Loyalty 5.968539e-04 NA NA
Family.Pride 2.709423e-04 NA NA
Expression 1.577996e-03 NA NA
Spend.Time.Together 2.257353e-04 NA NA
Feel.Close 2.696780e-04 NA NA
Togetherness 2.825447e-03 NA NA
Religious.Attendance 1.433416e-03 NA NA
Religious.Importance -3.354434e-03 NA NA
Close.knit.Community -9.100764e-05 NA NA
Helpful.Community -1.937361e-03 NA NA
Community.Shares.Values 9.637355e-04 NA NA
Get.Along 2.370701e-03 NA NA
Community.Trust 1.025102e-03 NA NA
Health.Insurance -5.639420e-05 NA NA
Dental.Insurance 4.379893e-04 NA NA
Discrimination 2.733514e-04 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="family_test_VIMP.png",width=5,height=5,units="in")Source of Information: Health Professionals
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")->rfobj
print(rfobj) Sample size: 1927
Frequency of class labels: 925, 1002
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 529.8617
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1218
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0832
(OOB) Brier score: 0.22739259
(OOB) Normalized Brier score: 0.90957038
(OOB) AUC: 0.67375951
(OOB) PR-AUC: 0.63055041
(OOB) G-mean: 0.62051216
(OOB) Requested performance error: 0.37948784
Confusion matrix:
predicted
observed No Yes class.error
No 562 363 0.3924
Yes 367 635 0.3663
(OOB) Misclassification rate: 0.3788272
plot(rfobj,plots.one.page = FALSE)

all No Yes
EnglishSpeak 0.0087 NA NA
Get.Along 0.0077 NA NA
Community.Shares.Values 0.0076 NA NA
Spend.Time.Together 0.0068 NA NA
Expression 0.0065 NA NA
Gender 0.0064 NA NA
Age 0.0056 NA NA
Similar.Values 0.0049 NA NA
Health.Insurance 0.0049 NA NA
Feel.Close 0.0048 NA NA
Community.Trust 0.0045 NA NA
Family.Pride 0.0045 NA NA
Income_median 0.0041 NA NA
Discrimination 0.0039 NA NA
Dental.Insurance 0.0034 NA NA
Helpful.Community 0.0031 NA NA
Family.Respect 0.0030 NA NA
Trust 0.0026 NA NA
Close.knit.Community 0.0012 NA NA
Loyalty 0.0006 NA NA
Religious.Importance -0.0008 NA NA
Togetherness -0.0008 NA NA
Ethnicity -0.0018 NA NA
Religious.Attendance -0.0021 NA NA
Religion -0.0023 NA NA
Helpful.Family -0.0023 NA NA
rfobj$importance all No Yes
Ethnicity -0.0018143542 NA NA
Age 0.0056014941 NA NA
Gender 0.0063757609 NA NA
Religion -0.0022995991 NA NA
Employment -0.0024295113 NA NA
Income_median 0.0041083863 NA NA
EnglishSpeak 0.0086779884 NA NA
EnglishDiff -0.0028554330 NA NA
See.Family -0.0040666801 NA NA
Close.Family -0.0026329858 NA NA
Helpful.Family -0.0023203869 NA NA
See.Friends -0.0059329832 NA NA
Close.Friends -0.0071864210 NA NA
Helpful.Friends -0.0065896944 NA NA
Family.Respect 0.0029979134 NA NA
Similar.Values 0.0049175798 NA NA
Successful.Family -0.0029246611 NA NA
Trust 0.0026337288 NA NA
Loyalty 0.0005523029 NA NA
Family.Pride 0.0045040927 NA NA
Expression 0.0065145648 NA NA
Spend.Time.Together 0.0067966620 NA NA
Feel.Close 0.0048483539 NA NA
Togetherness -0.0008427275 NA NA
Religious.Attendance -0.0020812857 NA NA
Religious.Importance -0.0007880284 NA NA
Close.knit.Community 0.0012392363 NA NA
Helpful.Community 0.0030590686 NA NA
Community.Shares.Values 0.0075557164 NA NA
Get.Along 0.0077018301 NA NA
Community.Trust 0.0045040927 NA NA
Health.Insurance 0.0049053114 NA NA
Dental.Insurance 0.0033728024 NA NA
Discrimination 0.0039211271 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 322.849
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.17657038
(OOB) Normalized Brier score: 0.70628152
(OOB) AUC: 0.85464797
(OOB) PR-AUC: 0.84380916
(OOB) G-mean: 0.77511401
(OOB) Requested performance error: 0.22488599
Confusion matrix:
predicted
observed Yes No class.error
Yes 597 159 0.2103
No 188 598 0.2392
(OOB) Misclassification rate: 0.2250324
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 322.849
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.17657038
(OOB) Normalized Brier score: 0.70628152
(OOB) AUC: 0.85464797
(OOB) PR-AUC: 0.84380916
(OOB) G-mean: 0.77511401
(OOB) Requested performance error: 0.22488599
Confusion matrix:
predicted
observed Yes No class.error
Yes 597 159 0.2103
No 188 598 0.2392
(OOB) Misclassification rate: 0.2250324
plot(rfobj,plots.one.page = FALSE)

all Yes No
EnglishSpeak 0.0311 NA NA
EnglishDiff 0.0209 NA NA
Dental.Insurance 0.0201 NA NA
Ethnicity 0.0164 NA NA
Religious.Importance 0.0162 NA NA
See.Friends 0.0149 NA NA
Income_median 0.0143 NA NA
Religion 0.0130 NA NA
Community.Trust 0.0123 NA NA
Close.Family 0.0118 NA NA
See.Family 0.0117 NA NA
Successful.Family 0.0110 NA NA
Get.Along 0.0104 NA NA
Close.knit.Community 0.0104 NA NA
Religious.Attendance 0.0104 NA NA
Feel.Close 0.0104 NA NA
Helpful.Family 0.0093 NA NA
Age 0.0091 NA NA
Discrimination 0.0078 NA NA
Helpful.Friends 0.0071 NA NA
Helpful.Community 0.0066 NA NA
Community.Shares.Values 0.0065 NA NA
Togetherness 0.0058 NA NA
Family.Respect 0.0058 NA NA
Similar.Values 0.0058 NA NA
Employment 0.0045 NA NA
rfobj$importance all Yes No
Ethnicity 0.0164029140 NA NA
Age 0.0090830369 NA NA
Gender 0.0039245209 NA NA
Religion 0.0130344581 NA NA
Employment 0.0045412390 NA NA
Income_median 0.0143163422 NA NA
EnglishSpeak 0.0311343089 NA NA
EnglishDiff 0.0209034590 NA NA
See.Family 0.0116885920 NA NA
Close.Family 0.0117547266 NA NA
Helpful.Family 0.0092522165 NA NA
See.Friends 0.0149282081 NA NA
Close.Friends 0.0013581594 NA NA
Helpful.Friends 0.0071357674 NA NA
Family.Respect 0.0058385032 NA NA
Similar.Values 0.0058374094 NA NA
Successful.Family 0.0110275602 NA NA
Trust 0.0039048813 NA NA
Loyalty 0.0038917886 NA NA
Family.Pride 0.0038939707 NA NA
Expression 0.0012972629 NA NA
Spend.Time.Together 0.0019728304 NA NA
Feel.Close 0.0103847047 NA NA
Togetherness 0.0058385032 NA NA
Religious.Attendance 0.0103913067 NA NA
Religious.Importance 0.0162199437 NA NA
Close.knit.Community 0.0104001093 NA NA
Helpful.Community 0.0066308331 NA NA
Community.Shares.Values 0.0064928827 NA NA
Get.Along 0.0104111128 NA NA
Community.Trust 0.0123248245 NA NA
Health.Insurance 0.0006527058 NA NA
Dental.Insurance 0.0201072957 NA NA
Discrimination 0.0077967363 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="healthpro_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
200.0000000 185.0000000 1.0810811 0.4805195 0.6000000 0.6200000
prec npv misclass brier brier.norm auc
0.5935829 0.6262626 0.3896104 0.2335855 0.9343418 0.6500811
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5967742 0.6096605 0.4805195 0.6277839 0.6033461 0.6097893
gmean
0.6099180
test_rf$importance all Yes No
Ethnicity 1.416251e-03 NA NA
Age 3.833091e-04 NA NA
Gender 5.709630e-04 NA NA
Religion 3.265128e-04 NA NA
Employment 5.719352e-05 NA NA
Income_median 3.094068e-03 NA NA
EnglishSpeak 1.868566e-02 NA NA
EnglishDiff 1.095154e-02 NA NA
See.Family 2.481570e-03 NA NA
Close.Family 5.369103e-04 NA NA
Helpful.Family 2.884903e-03 NA NA
See.Friends 1.065651e-03 NA NA
Close.Friends 1.526279e-03 NA NA
Helpful.Friends 4.299457e-04 NA NA
Family.Respect 1.237963e-03 NA NA
Similar.Values -2.771710e-03 NA NA
Successful.Family -9.325210e-04 NA NA
Trust -1.068838e-03 NA NA
Loyalty 4.830675e-04 NA NA
Family.Pride 5.179147e-04 NA NA
Expression 1.251865e-03 NA NA
Spend.Time.Together 6.868427e-04 NA NA
Feel.Close 7.449733e-04 NA NA
Togetherness -2.894697e-04 NA NA
Religious.Attendance 1.826614e-03 NA NA
Religious.Importance 9.029164e-04 NA NA
Close.knit.Community -3.583688e-05 NA NA
Helpful.Community 1.156417e-03 NA NA
Community.Shares.Values -1.892850e-03 NA NA
Get.Along -6.693266e-04 NA NA
Community.Trust 2.128981e-03 NA NA
Health.Insurance 3.435879e-03 NA NA
Dental.Insurance 8.937844e-03 NA NA
Discrimination 4.512359e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="healthpro_test_VIMP.png",width=5,height=5,units="in")Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1936
Frequency of class labels: 259, 1677
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 295.831
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1224
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.4749
(OOB) Brier score: 0.10519154
(OOB) Normalized Brier score: 0.42076617
(OOB) AUC: 0.7338273
(OOB) PR-AUC: 0.32141853
(OOB) G-mean: 0.66878662
(OOB) Requested performance error: 0.33121338
Confusion matrix:
predicted
observed 0 Yes class.error
0 203 56 0.2162
Yes 720 957 0.4293
(OOB) Misclassification rate: 0.4008264
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1677.0000000 259.0000000 6.4749035 0.1337810 0.7837838 0.5706619
prec npv misclass brier brier.norm auc
0.2199350 0.9447187 0.4008264 0.1051915 0.4207662 0.7338273
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3434856 0.4633100 0.1337810 0.3214185 0.5061361 0.5660483
gmean
0.6687866
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
#
#
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 278.29
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.12406003
(OOB) Normalized Brier score: 0.49624012
(OOB) AUC: 0.9650458
(OOB) PR-AUC: 0.96253384
(OOB) G-mean: 0.89247353
(OOB) Requested performance error: 0.10752647
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 656 104 0.1368
0 61 729 0.0772
(OOB) Misclassification rate: 0.1064516
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 278.29
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.12406003
(OOB) Normalized Brier score: 0.49624012
(OOB) AUC: 0.9650458
(OOB) PR-AUC: 0.96253384
(OOB) G-mean: 0.89247353
(OOB) Requested performance error: 0.10752647
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 656 104 0.1368
0 61 729 0.0772
(OOB) Misclassification rate: 0.1064516
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Religion 0.0225 NA NA
EnglishSpeak 0.0222 NA NA
EnglishDiff 0.0149 NA NA
Community.Shares.Values 0.0135 NA NA
Income_median 0.0129 NA NA
Close.Family 0.0125 NA NA
Community.Trust 0.0122 NA NA
Ethnicity 0.0116 NA NA
Religious.Attendance 0.0114 NA NA
Helpful.Community 0.0101 NA NA
Close.Friends 0.0096 NA NA
Helpful.Family 0.0092 NA NA
Employment 0.0089 NA NA
See.Friends 0.0087 NA NA
Get.Along 0.0074 NA NA
Religious.Importance 0.0074 NA NA
Age 0.0073 NA NA
Family.Respect 0.0067 NA NA
Expression 0.0052 NA NA
Helpful.Friends 0.0046 NA NA
Togetherness 0.0045 NA NA
Close.knit.Community 0.0041 NA NA
Family.Pride 0.0039 NA NA
Feel.Close 0.0037 NA NA
See.Family 0.0036 NA NA
Similar.Values 0.0034 NA NA
rfobj$importance all Yes 0
Ethnicity 0.011608770 NA NA
Age 0.007319489 NA NA
Gender 0.003407701 NA NA
Religion 0.022516372 NA NA
Employment 0.008935426 NA NA
Income_median 0.012900201 NA NA
EnglishSpeak 0.022248379 NA NA
EnglishDiff 0.014910657 NA NA
See.Family 0.003558520 NA NA
Close.Family 0.012486547 NA NA
Helpful.Family 0.009229550 NA NA
See.Friends 0.008726205 NA NA
Close.Friends 0.009591684 NA NA
Helpful.Friends 0.004628107 NA NA
Family.Respect 0.006676196 NA NA
Similar.Values 0.003407701 NA NA
Successful.Family 0.002043055 NA NA
Trust 0.002517355 NA NA
Loyalty 0.001972914 NA NA
Family.Pride 0.003946460 NA NA
Expression 0.005169485 NA NA
Spend.Time.Together 0.003196971 NA NA
Feel.Close 0.003744036 NA NA
Togetherness 0.004489298 NA NA
Religious.Attendance 0.011370559 NA NA
Religious.Importance 0.007434278 NA NA
Close.knit.Community 0.004117995 NA NA
Helpful.Community 0.010143658 NA NA
Community.Shares.Values 0.013535732 NA NA
Get.Along 0.007434278 NA NA
Community.Trust 0.012243368 NA NA
Discrimination 0.001096147 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="HIns_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
335.0000000 51.0000000 6.5686275 0.1321244 0.6470588 0.5492537
prec npv misclass brier brier.norm auc
0.1793478 0.9108911 0.4378238 0.1121928 0.4487711 0.6093649
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2808511 0.3984187 0.1321244 0.1919435 0.4385025 0.4972863
gmean
0.5961539
test_rf$importance all Yes 0
Ethnicity -0.0007170282 NA NA
Age -0.0065876180 NA NA
Gender 0.0016858270 NA NA
Religion 0.0032049218 NA NA
Employment 0.0018750855 NA NA
Income_median 0.0266195366 NA NA
EnglishSpeak 0.0087105042 NA NA
EnglishDiff -0.0040702210 NA NA
See.Family 0.0023935099 NA NA
Close.Family -0.0060360251 NA NA
Helpful.Family -0.0037656635 NA NA
See.Friends -0.0030925577 NA NA
Close.Friends 0.0010820995 NA NA
Helpful.Friends 0.0030769449 NA NA
Family.Respect 0.0017744430 NA NA
Similar.Values -0.0015062104 NA NA
Successful.Family -0.0028634419 NA NA
Trust 0.0005119584 NA NA
Loyalty -0.0021556567 NA NA
Family.Pride 0.0005819887 NA NA
Expression 0.0011362520 NA NA
Spend.Time.Together 0.0007929601 NA NA
Feel.Close 0.0002449711 NA NA
Togetherness 0.0028090090 NA NA
Religious.Attendance 0.0053798612 NA NA
Religious.Importance 0.0005212834 NA NA
Close.knit.Community 0.0017700534 NA NA
Helpful.Community -0.0028534784 NA NA
Community.Shares.Values -0.0011828800 NA NA
Get.Along -0.0014814092 NA NA
Community.Trust -0.0024477784 NA NA
Discrimination 0.0008231928 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="HIns_test_VIMP.png",width=5,height=5,units="in")Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1932
Frequency of class labels: 760, 1172
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 451.0523
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1221
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.5421
(OOB) Brier score: 0.17933324
(OOB) Normalized Brier score: 0.71733296
(OOB) AUC: 0.79775743
(OOB) PR-AUC: 0.71577963
(OOB) G-mean: 0.72787095
(OOB) Requested performance error: 0.27212905
Confusion matrix:
predicted
observed 0 Yes class.error
0 572 188 0.2474
Yes 347 825 0.2961
(OOB) Misclassification rate: 0.2769151
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1172.0000000 760.0000000 1.5421053 0.3933747 0.7526316 0.7039249
prec npv misclass brier brier.norm auc
0.6224157 0.8144126 0.2769151 0.1793332 0.7173330 0.7977574
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6813580 0.7163581 0.3933747 0.7157796 0.7046145 0.7221145
gmean
0.7278710
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 285.066
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.13945449
(OOB) Normalized Brier score: 0.55781796
(OOB) AUC: 0.91210887
(OOB) PR-AUC: 0.90552285
(OOB) G-mean: 0.84478415
(OOB) Requested performance error: 0.15521585
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 625 132 0.1744
0 107 682 0.1356
(OOB) Misclassification rate: 0.1545925
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 285.066
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.13945449
(OOB) Normalized Brier score: 0.55781796
(OOB) AUC: 0.91210887
(OOB) PR-AUC: 0.90552285
(OOB) G-mean: 0.84478415
(OOB) Requested performance error: 0.15521585
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 625 132 0.1744
0 107 682 0.1356
(OOB) Misclassification rate: 0.1545925
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Income_median 0.0348 NA NA
EnglishSpeak 0.0324 NA NA
Age 0.0303 NA NA
Ethnicity 0.0293 NA NA
Employment 0.0257 NA NA
Religion 0.0234 NA NA
EnglishDiff 0.0226 NA NA
See.Friends 0.0197 NA NA
Helpful.Family 0.0184 NA NA
Close.Family 0.0152 NA NA
Community.Shares.Values 0.0142 NA NA
Close.knit.Community 0.0141 NA NA
Helpful.Community 0.0136 NA NA
Close.Friends 0.0133 NA NA
Discrimination 0.0130 NA NA
See.Family 0.0123 NA NA
Expression 0.0115 NA NA
Community.Trust 0.0114 NA NA
Helpful.Friends 0.0114 NA NA
Get.Along 0.0110 NA NA
Religious.Importance 0.0092 NA NA
Family.Respect 0.0091 NA NA
Feel.Close 0.0084 NA NA
Similar.Values 0.0078 NA NA
Religious.Attendance 0.0077 NA NA
Togetherness 0.0077 NA NA
rfobj$importance all Yes 0
Ethnicity 0.029295012 NA NA
Age 0.030250260 NA NA
Gender 0.005238544 NA NA
Religion 0.023372807 NA NA
Employment 0.025687492 NA NA
Income_median 0.034751917 NA NA
EnglishSpeak 0.032383582 NA NA
EnglishDiff 0.022637313 NA NA
See.Family 0.012275885 NA NA
Close.Family 0.015189066 NA NA
Helpful.Family 0.018379898 NA NA
See.Friends 0.019742478 NA NA
Close.Friends 0.013263933 NA NA
Helpful.Friends 0.011443697 NA NA
Family.Respect 0.009066287 NA NA
Similar.Values 0.007771092 NA NA
Successful.Family 0.005800118 NA NA
Trust 0.005124879 NA NA
Loyalty 0.006216303 NA NA
Family.Pride 0.006475900 NA NA
Expression 0.011493926 NA NA
Spend.Time.Together 0.005019204 NA NA
Feel.Close 0.008390413 NA NA
Togetherness 0.007715085 NA NA
Religious.Attendance 0.007715085 NA NA
Religious.Importance 0.009184496 NA NA
Close.knit.Community 0.014083837 NA NA
Helpful.Community 0.013571050 NA NA
Community.Shares.Values 0.014247108 NA NA
Get.Along 0.011037954 NA NA
Community.Trust 0.011443697 NA NA
Discrimination 0.012951897 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="DIns_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
234.0000000 152.0000000 1.5394737 0.3937824 0.8223684 0.6880342
prec npv misclass brier brier.norm auc
0.6313131 0.8563830 0.2590674 0.1835812 0.7343248 0.7969242
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.7142857 0.7378552 0.3937824 0.6985360 0.7332471 0.7450318
gmean
0.7522085
test_rf$importance all Yes 0
Ethnicity 0.0074850819 NA NA
Age 0.0138419389 NA NA
Gender 0.0011070968 NA NA
Religion 0.0006469828 NA NA
Employment 0.0171435775 NA NA
Income_median 0.0667999061 NA NA
EnglishSpeak 0.0225371650 NA NA
EnglishDiff 0.0062544370 NA NA
See.Family -0.0009065089 NA NA
Close.Family 0.0030245757 NA NA
Helpful.Family 0.0052611352 NA NA
See.Friends 0.0007200479 NA NA
Close.Friends 0.0046701490 NA NA
Helpful.Friends 0.0052013977 NA NA
Family.Respect 0.0003192247 NA NA
Similar.Values 0.0008803404 NA NA
Successful.Family 0.0003647281 NA NA
Trust 0.0021871845 NA NA
Loyalty 0.0010393549 NA NA
Family.Pride 0.0023372421 NA NA
Expression 0.0027105884 NA NA
Spend.Time.Together 0.0026602853 NA NA
Feel.Close 0.0011880424 NA NA
Togetherness 0.0011204948 NA NA
Religious.Attendance 0.0020460395 NA NA
Religious.Importance -0.0002700353 NA NA
Close.knit.Community 0.0013556106 NA NA
Helpful.Community 0.0006672165 NA NA
Community.Shares.Values 0.0013819268 NA NA
Get.Along 0.0034466808 NA NA
Community.Trust 0.0001582721 NA NA
Discrimination -0.0012461848 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="DIns_test_VIMP.png",width=5,height=5,units="in")Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1918
Frequency of class labels: 614, 1304
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 451.908
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1212
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 2.1238
(OOB) Brier score: 0.18414117
(OOB) Normalized Brier score: 0.73656469
(OOB) AUC: 0.74369423
(OOB) PR-AUC: 0.55768871
(OOB) G-mean: 0.6964641
(OOB) Requested performance error: 0.3035359
Confusion matrix:
predicted
observed 0 Yes class.error
0 432 182 0.2964
Yes 405 899 0.3106
(OOB) Misclassification rate: 0.306048
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0451 NA NA
Health.Insurance 0.0367 NA NA
Dental.Insurance 0.0277 NA NA
Gender 0.0172 NA NA
Income_median 0.0074 NA NA
EnglishDiff 0.0061 NA NA
Community.Shares.Values 0.0055 NA NA
Employment 0.0055 NA NA
Discrimination 0.0045 NA NA
Togetherness 0.0043 NA NA
EnglishSpeak 0.0035 NA NA
Helpful.Family 0.0032 NA NA
Close.knit.Community 0.0029 NA NA
Religious.Importance 0.0023 NA NA
Close.Family 0.0019 NA NA
Get.Along 0.0019 NA NA
Religion 0.0016 NA NA
See.Family 0.0016 NA NA
Ethnicity 0.0012 NA NA
Loyalty 0.0008 NA NA
Family.Respect 0.0000 NA NA
Trust -0.0001 NA NA
Religious.Attendance -0.0005 NA NA
See.Friends -0.0008 NA NA
Community.Trust -0.0008 NA NA
Family.Pride -0.0015 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1304.0000000 614.0000000 2.1237785 0.3201251 0.7035831 0.6894172
prec npv misclass brier brier.norm auc
0.5161290 0.8316374 0.3060480 0.1841412 0.7365647 0.7436942
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5954514 0.6653643 0.3201251 0.5576887 0.6459578 0.6809142
gmean
0.6964641
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1536
Frequency of class labels: 754, 782
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 296.945
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 971
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0371
(OOB) Brier score: 0.14645696
(OOB) Normalized Brier score: 0.58582786
(OOB) AUC: 0.91847572
(OOB) PR-AUC: 0.91302995
(OOB) G-mean: 0.84643071
(OOB) Requested performance error: 0.15356929
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 658 96 0.1273
0 140 642 0.1790
(OOB) Misclassification rate: 0.1536458
print(rfobj) Sample size: 1536
Frequency of class labels: 754, 782
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 296.945
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 971
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0371
(OOB) Brier score: 0.14645696
(OOB) Normalized Brier score: 0.58582786
(OOB) AUC: 0.91847572
(OOB) PR-AUC: 0.91302995
(OOB) G-mean: 0.84643071
(OOB) Requested performance error: 0.15356929
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 658 96 0.1273
0 140 642 0.1790
(OOB) Misclassification rate: 0.1536458
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Age 0.0306 NA NA
Ethnicity 0.0290 NA NA
Health.Insurance 0.0253 NA NA
Religion 0.0209 NA NA
Discrimination 0.0190 NA NA
EnglishDiff 0.0178 NA NA
Gender 0.0177 NA NA
Helpful.Family 0.0151 NA NA
EnglishSpeak 0.0145 NA NA
Close.Family 0.0131 NA NA
See.Family 0.0123 NA NA
Religious.Importance 0.0120 NA NA
Dental.Insurance 0.0110 NA NA
Spend.Time.Together 0.0106 NA NA
See.Friends 0.0092 NA NA
Religious.Attendance 0.0087 NA NA
Close.Friends 0.0085 NA NA
Income_median 0.0085 NA NA
Helpful.Community 0.0079 NA NA
Helpful.Friends 0.0072 NA NA
Close.knit.Community 0.0067 NA NA
Successful.Family 0.0066 NA NA
Community.Shares.Values 0.0060 NA NA
Family.Respect 0.0040 NA NA
Expression 0.0039 NA NA
Community.Trust 0.0034 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0290438628 NA NA
Age 0.0305609750 NA NA
Gender 0.0176675835 NA NA
Religion 0.0209462698 NA NA
Employment 0.0033749309 NA NA
Income_median 0.0084739447 NA NA
EnglishSpeak 0.0145220898 NA NA
EnglishDiff 0.0177954937 NA NA
See.Family 0.0123384853 NA NA
Close.Family 0.0130819988 NA NA
Helpful.Family 0.0150982127 NA NA
See.Friends 0.0092231420 NA NA
Close.Friends 0.0085124007 NA NA
Helpful.Friends 0.0072321580 NA NA
Family.Respect 0.0039877196 NA NA
Similar.Values 0.0006775183 NA NA
Successful.Family 0.0065927686 NA NA
Trust 0.0032823973 NA NA
Loyalty 0.0019799576 NA NA
Family.Pride 0.0033749309 NA NA
Expression 0.0039434310 NA NA
Spend.Time.Together 0.0106139711 NA NA
Feel.Close 0.0033025124 NA NA
Togetherness 0.0032823973 NA NA
Religious.Attendance 0.0087107806 NA NA
Religious.Importance 0.0119857772 NA NA
Close.knit.Community 0.0067058637 NA NA
Helpful.Community 0.0078952941 NA NA
Community.Shares.Values 0.0059538655 NA NA
Get.Along 0.0013595936 NA NA
Community.Trust 0.0033749309 NA NA
Health.Insurance 0.0253433804 NA NA
Dental.Insurance 0.0110361377 NA NA
Discrimination 0.0189547483 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="PChk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
260.0000000 122.0000000 2.1311475 0.3193717 0.5983607 0.7038462
prec npv misclass brier brier.norm auc
0.4866667 0.7887931 0.3298429 0.1941655 0.7766620 0.6926860
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5367647 0.6235821 0.3193717 0.5100775 0.5928642 0.6362729
gmean
0.6489637
test_rf$importance all Yes 0
Ethnicity -4.485200e-03 NA NA
Age 1.456814e-02 NA NA
Gender 1.332700e-03 NA NA
Religion 2.523687e-03 NA NA
Employment 1.750327e-04 NA NA
Income_median 5.524786e-03 NA NA
EnglishSpeak -9.543476e-04 NA NA
EnglishDiff 1.994580e-03 NA NA
See.Family -2.830507e-03 NA NA
Close.Family -8.254770e-05 NA NA
Helpful.Family 1.710651e-03 NA NA
See.Friends 1.754683e-03 NA NA
Close.Friends -1.154867e-03 NA NA
Helpful.Friends 5.813948e-04 NA NA
Family.Respect -2.349749e-04 NA NA
Similar.Values 1.152939e-04 NA NA
Successful.Family -1.139116e-03 NA NA
Trust -1.046174e-03 NA NA
Loyalty 4.127064e-04 NA NA
Family.Pride -9.711867e-07 NA NA
Expression -8.976143e-04 NA NA
Spend.Time.Together -3.881086e-04 NA NA
Feel.Close 3.533802e-04 NA NA
Togetherness -1.157534e-04 NA NA
Religious.Attendance -1.493394e-03 NA NA
Religious.Importance -1.103743e-03 NA NA
Close.knit.Community 3.038849e-03 NA NA
Helpful.Community -2.561196e-03 NA NA
Community.Shares.Values 1.147596e-03 NA NA
Get.Along 2.864171e-03 NA NA
Community.Trust -1.760672e-03 NA NA
Health.Insurance 2.784247e-02 NA NA
Dental.Insurance 2.954517e-02 NA NA
Discrimination 2.360783e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="PChk_test_VIMP.png",width=5,height=5,units="in")Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1915
Frequency of class labels: 786, 1129
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 472.9973
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1210
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.4364
(OOB) Brier score: 0.19312832
(OOB) Normalized Brier score: 0.77251328
(OOB) AUC: 0.76845291
(OOB) PR-AUC: 0.66442725
(OOB) G-mean: 0.69989128
(OOB) Requested performance error: 0.30010872
Confusion matrix:
predicted
observed 0 Yes class.error
0 566 220 0.2799
Yes 361 768 0.3198
(OOB) Misclassification rate: 0.3033943
plot(imb,plots.one.page = F)

all 0 Yes
Dental.Insurance 0.0207 NA NA
Health.Insurance 0.0023 NA NA
Income_median 0.0014 NA NA
Helpful.Community 0.0012 NA NA
EnglishDiff 0.0006 NA NA
Community.Trust 0.0002 NA NA
Spend.Time.Together -0.0006 NA NA
EnglishSpeak -0.0006 NA NA
Helpful.Friends -0.0011 NA NA
Employment -0.0012 NA NA
Family.Respect -0.0014 NA NA
Feel.Close -0.0015 NA NA
Togetherness -0.0016 NA NA
Loyalty -0.0017 NA NA
Family.Pride -0.0017 NA NA
Helpful.Family -0.0017 NA NA
Religion -0.0020 NA NA
Community.Shares.Values -0.0021 NA NA
Trust -0.0023 NA NA
Close.Friends -0.0025 NA NA
Discrimination -0.0026 NA NA
Close.knit.Community -0.0026 NA NA
See.Family -0.0028 NA NA
Expression -0.0031 NA NA
Successful.Family -0.0040 NA NA
Get.Along -0.0042 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1129.0000000 786.0000000 1.4363868 0.4104439 0.7201018 0.6802480
prec npv misclass brier brier.norm auc
0.6105717 0.7773279 0.3033943 0.1931283 0.7725133 0.7684529
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6608290 0.6916811 0.4104439 0.6644272 0.6803601 0.6957862
gmean
0.6998913
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1533
Frequency of class labels: 754, 779
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 300.131
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 969
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0332
(OOB) Brier score: 0.15189957
(OOB) Normalized Brier score: 0.60759826
(OOB) AUC: 0.89715441
(OOB) PR-AUC: 0.8931993
(OOB) G-mean: 0.81419411
(OOB) Requested performance error: 0.18580589
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 622 132 0.1751
0 153 626 0.1964
(OOB) Misclassification rate: 0.18591
print(rfobj) Sample size: 1533
Frequency of class labels: 754, 779
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 300.131
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 969
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0332
(OOB) Brier score: 0.15189957
(OOB) Normalized Brier score: 0.60759826
(OOB) AUC: 0.89715441
(OOB) PR-AUC: 0.8931993
(OOB) G-mean: 0.81419411
(OOB) Requested performance error: 0.18580589
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 622 132 0.1751
0 153 626 0.1964
(OOB) Misclassification rate: 0.18591
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Ethnicity 0.0347 NA NA
Religion 0.0230 NA NA
Dental.Insurance 0.0197 NA NA
Religious.Importance 0.0170 NA NA
Age 0.0164 NA NA
EnglishSpeak 0.0163 NA NA
EnglishDiff 0.0150 NA NA
See.Friends 0.0130 NA NA
Religious.Attendance 0.0117 NA NA
Close.knit.Community 0.0105 NA NA
Gender 0.0104 NA NA
Helpful.Friends 0.0098 NA NA
Income_median 0.0098 NA NA
Successful.Family 0.0098 NA NA
Community.Shares.Values 0.0092 NA NA
Helpful.Family 0.0091 NA NA
Spend.Time.Together 0.0091 NA NA
Feel.Close 0.0085 NA NA
Community.Trust 0.0085 NA NA
Family.Pride 0.0078 NA NA
Expression 0.0072 NA NA
Trust 0.0065 NA NA
Close.Friends 0.0065 NA NA
Close.Family 0.0065 NA NA
Togetherness 0.0059 NA NA
Family.Respect 0.0059 NA NA
rfobj$importance all Yes 0
Ethnicity 0.034717565 NA NA
Age 0.016382966 NA NA
Gender 0.010434258 NA NA
Religion 0.023025972 NA NA
Employment 0.001304812 NA NA
Income_median 0.009794819 NA NA
EnglishSpeak 0.016308280 NA NA
EnglishDiff 0.015007732 NA NA
See.Family 0.001310048 NA NA
Close.Family 0.006519843 NA NA
Helpful.Family 0.009138969 NA NA
See.Friends 0.013048118 NA NA
Close.Friends 0.006520897 NA NA
Helpful.Friends 0.009794819 NA NA
Family.Respect 0.005867704 NA NA
Similar.Values 0.005231874 NA NA
Successful.Family 0.009784236 NA NA
Trust 0.006520897 NA NA
Loyalty 0.003927042 NA NA
Family.Pride 0.007834149 NA NA
Expression 0.007185167 NA NA
Spend.Time.Together 0.009130510 NA NA
Feel.Close 0.008509011 NA NA
Togetherness 0.005867704 NA NA
Religious.Attendance 0.011740123 NA NA
Religious.Importance 0.016962553 NA NA
Close.knit.Community 0.010460735 NA NA
Helpful.Community 0.005216090 NA NA
Community.Shares.Values 0.009155888 NA NA
Get.Along 0.004565001 NA NA
Community.Trust 0.008479427 NA NA
Health.Insurance 0.000838941 NA NA
Dental.Insurance 0.019654670 NA NA
Discrimination 0.004562898 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="DChk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
225.0000000 157.0000000 1.4331210 0.4109948 0.7961783 0.7022222
prec npv misclass brier brier.norm auc
0.6510417 0.8315789 0.2591623 0.1869835 0.7479338 0.7908846
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.7163324 0.7382005 0.4109948 0.6824346 0.7320292 0.7429632
gmean
0.7477260
test_rf$importance all Yes 0
Ethnicity 8.791599e-03 NA NA
Age 9.593166e-03 NA NA
Gender 2.667960e-03 NA NA
Religion 8.406042e-03 NA NA
Employment 1.257021e-03 NA NA
Income_median 5.186217e-03 NA NA
EnglishSpeak 1.030861e-02 NA NA
EnglishDiff 6.582020e-03 NA NA
See.Family 4.339230e-03 NA NA
Close.Family 3.508683e-03 NA NA
Helpful.Family 7.960862e-05 NA NA
See.Friends 3.703856e-03 NA NA
Close.Friends 3.804196e-03 NA NA
Helpful.Friends 1.915058e-03 NA NA
Family.Respect 1.003330e-03 NA NA
Similar.Values 2.391774e-03 NA NA
Successful.Family 3.963704e-03 NA NA
Trust -6.897584e-04 NA NA
Loyalty 1.530624e-03 NA NA
Family.Pride 6.049649e-04 NA NA
Expression -1.119621e-04 NA NA
Spend.Time.Together 1.312060e-03 NA NA
Feel.Close 1.578852e-03 NA NA
Togetherness -8.745424e-04 NA NA
Religious.Attendance 2.143785e-04 NA NA
Religious.Importance 4.069140e-03 NA NA
Close.knit.Community 1.658804e-03 NA NA
Helpful.Community 5.234839e-04 NA NA
Community.Shares.Values -2.675590e-03 NA NA
Get.Along 1.237687e-03 NA NA
Community.Trust 1.371337e-03 NA NA
Health.Insurance 8.434049e-03 NA NA
Dental.Insurance 7.611210e-02 NA NA
Discrimination 3.339864e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="DChk_test_VIMP.png",width=5,height=5,units="in")Urgent Care
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1908
Frequency of class labels: 1594, 314
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 357.6643
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1206
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 5.0764
(OOB) Brier score: 0.13517241
(OOB) Normalized Brier score: 0.54068965
(OOB) AUC: 0.59929952
(OOB) PR-AUC: 0.23059263
(OOB) G-mean: 0.56307797
(OOB) Requested performance error: 0.43692203
Confusion matrix:
predicted
observed 0 Yes class.error
0 818 776 0.4868
Yes 120 194 0.3822
(OOB) Misclassification rate: 0.4696017
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0126 NA NA
Family.Pride 0.0055 NA NA
Discrimination 0.0055 NA NA
Spend.Time.Together 0.0046 NA NA
Health.Insurance 0.0046 NA NA
Helpful.Family 0.0039 NA NA
Close.Family 0.0037 NA NA
Trust 0.0022 NA NA
Similar.Values 0.0022 NA NA
Dental.Insurance 0.0008 NA NA
Employment 0.0003 NA NA
Togetherness 0.0000 NA NA
Income_median 0.0000 NA NA
Helpful.Community -0.0005 NA NA
See.Friends -0.0015 NA NA
Loyalty -0.0021 NA NA
Close.knit.Community -0.0021 NA NA
Expression -0.0025 NA NA
Community.Shares.Values -0.0033 NA NA
Feel.Close -0.0036 NA NA
Family.Respect -0.0040 NA NA
Helpful.Friends -0.0060 NA NA
Successful.Family -0.0068 NA NA
Ethnicity -0.0068 NA NA
EnglishDiff -0.0071 NA NA
Gender -0.0072 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1594.0000000 314.0000000 5.0764331 0.1645702 0.6178344 0.5131744
prec npv misclass brier brier.norm auc
0.2000000 0.8720682 0.4696017 0.1351724 0.5406897 0.5992995
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3021807 0.4117806 0.1645702 0.2305926 0.4326293 0.4874293
gmean
0.5630780
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1528
Frequency of class labels: 751, 777
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 305.7
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 966
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0346
(OOB) Brier score: 0.15162729
(OOB) Normalized Brier score: 0.60650918
(OOB) AUC: 0.93551712
(OOB) PR-AUC: 0.92339775
(OOB) G-mean: 0.86102207
(OOB) Requested performance error: 0.13897793
Confusion matrix:
predicted
observed 0 Yes class.error
0 639 112 0.1491
Yes 100 677 0.1287
(OOB) Misclassification rate: 0.1387435
print(rfobj) Sample size: 1528
Frequency of class labels: 751, 777
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 305.7
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 966
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0346
(OOB) Brier score: 0.15162729
(OOB) Normalized Brier score: 0.60650918
(OOB) AUC: 0.93551712
(OOB) PR-AUC: 0.92339775
(OOB) G-mean: 0.86102207
(OOB) Requested performance error: 0.13897793
Confusion matrix:
predicted
observed 0 Yes class.error
0 639 112 0.1491
Yes 100 677 0.1287
(OOB) Misclassification rate: 0.1387435
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Ethnicity 0.0337 NA NA
Religion 0.0258 NA NA
See.Friends 0.0235 NA NA
Age 0.0227 NA NA
EnglishSpeak 0.0226 NA NA
Community.Shares.Values 0.0221 NA NA
Religious.Attendance 0.0167 NA NA
Close.Friends 0.0163 NA NA
Income_median 0.0162 NA NA
Dental.Insurance 0.0160 NA NA
Get.Along 0.0148 NA NA
Helpful.Community 0.0135 NA NA
EnglishDiff 0.0134 NA NA
Discrimination 0.0122 NA NA
Close.knit.Community 0.0121 NA NA
Togetherness 0.0117 NA NA
Religious.Importance 0.0103 NA NA
Loyalty 0.0095 NA NA
Community.Trust 0.0084 NA NA
Helpful.Friends 0.0083 NA NA
Similar.Values 0.0076 NA NA
Family.Pride 0.0076 NA NA
Trust 0.0075 NA NA
Close.Family 0.0075 NA NA
Helpful.Family 0.0073 NA NA
Spend.Time.Together 0.0058 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.0336813046 NA NA
Age 0.0227095880 NA NA
Gender 0.0032194041 NA NA
Religion 0.0258236476 NA NA
Employment 0.0024545883 NA NA
Income_median 0.0161892702 NA NA
EnglishSpeak 0.0225562836 NA NA
EnglishDiff 0.0133865189 NA NA
See.Family -0.0001910503 NA NA
Close.Family 0.0075445536 NA NA
Helpful.Family 0.0072624871 NA NA
See.Friends 0.0235000617 NA NA
Close.Friends 0.0163150446 NA NA
Helpful.Friends 0.0082827844 NA NA
Family.Respect 0.0036490383 NA NA
Similar.Values 0.0076168419 NA NA
Successful.Family 0.0051023980 NA NA
Trust 0.0075445536 NA NA
Loyalty 0.0095155977 NA NA
Family.Pride 0.0075957572 NA NA
Expression 0.0030955495 NA NA
Spend.Time.Together 0.0058044547 NA NA
Feel.Close 0.0018141052 NA NA
Togetherness 0.0116808576 NA NA
Religious.Attendance 0.0166752282 NA NA
Religious.Importance 0.0103401445 NA NA
Close.knit.Community 0.0121198210 NA NA
Helpful.Community 0.0135199656 NA NA
Community.Shares.Values 0.0220679404 NA NA
Get.Along 0.0148292625 NA NA
Community.Trust 0.0083611648 NA NA
Health.Insurance 0.0011182957 NA NA
Dental.Insurance 0.0160168680 NA NA
Discrimination 0.0121682723 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="UC_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
318.0000000 62.0000000 5.1290323 0.1631579 0.4516129 0.6226415
prec npv misclass brier brier.norm auc
0.1891892 0.8534483 0.4052632 0.1379955 0.5519820 0.5185636
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2666667 0.3891892 0.1631579 0.1733409 0.3984715 0.4597327
gmean
0.5302763
test_rf$importance all 0 Yes
Ethnicity -5.484342e-03 NA NA
Age 9.448379e-03 NA NA
Gender -1.361491e-03 NA NA
Religion -4.088015e-03 NA NA
Employment 2.721016e-03 NA NA
Income_median 1.204174e-03 NA NA
EnglishSpeak 3.732555e-03 NA NA
EnglishDiff 5.565420e-04 NA NA
See.Family 3.524575e-03 NA NA
Close.Family -1.506214e-03 NA NA
Helpful.Family 2.415261e-03 NA NA
See.Friends -1.858872e-03 NA NA
Close.Friends 1.177293e-02 NA NA
Helpful.Friends -1.709392e-03 NA NA
Family.Respect 1.392512e-03 NA NA
Similar.Values -7.695964e-04 NA NA
Successful.Family -3.196832e-04 NA NA
Trust -4.241532e-05 NA NA
Loyalty 1.841446e-03 NA NA
Family.Pride -7.113295e-04 NA NA
Expression -2.343188e-03 NA NA
Spend.Time.Together -1.792190e-03 NA NA
Feel.Close -1.040139e-03 NA NA
Togetherness -1.837450e-03 NA NA
Religious.Attendance -3.902555e-03 NA NA
Religious.Importance -3.452213e-03 NA NA
Close.knit.Community 3.846090e-03 NA NA
Helpful.Community 1.723408e-03 NA NA
Community.Shares.Values -2.957451e-04 NA NA
Get.Along -3.175906e-03 NA NA
Community.Trust 3.151664e-03 NA NA
Health.Insurance 7.437195e-04 NA NA
Dental.Insurance -6.805305e-05 NA NA
Discrimination -7.013379e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="UC_test_VIMP.png",width=5,height=5,units="in")Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1899
Frequency of class labels: 1642, 257
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 306.211
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1200
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.3891
(OOB) Brier score: 0.11175568
(OOB) Normalized Brier score: 0.44702273
(OOB) AUC: 0.67616364
(OOB) PR-AUC: 0.23627522
(OOB) G-mean: 0.62877744
(OOB) Requested performance error: 0.37122256
Confusion matrix:
predicted
observed 0 Yes class.error
0 970 672 0.4093
Yes 85 172 0.3307
(OOB) Misclassification rate: 0.3986309
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0299 NA NA
Ethnicity 0.0196 NA NA
Helpful.Friends 0.0083 NA NA
EnglishSpeak 0.0072 NA NA
Togetherness 0.0066 NA NA
Feel.Close 0.0062 NA NA
Family.Pride 0.0061 NA NA
Family.Respect 0.0060 NA NA
Religion 0.0060 NA NA
Close.knit.Community 0.0051 NA NA
Community.Trust 0.0050 NA NA
Trust 0.0047 NA NA
Close.Friends 0.0046 NA NA
EnglishDiff 0.0046 NA NA
See.Friends 0.0044 NA NA
Employment 0.0042 NA NA
Religious.Importance 0.0034 NA NA
Dental.Insurance 0.0031 NA NA
Loyalty 0.0031 NA NA
Get.Along 0.0026 NA NA
Health.Insurance 0.0019 NA NA
Similar.Values 0.0019 NA NA
See.Family 0.0018 NA NA
Expression 0.0011 NA NA
Helpful.Community 0.0009 NA NA
Helpful.Family -0.0002 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1642.0000000 257.0000000 6.3891051 0.1353344 0.6692607 0.5907430
prec npv misclass brier brier.norm auc
0.2037915 0.9194313 0.3986309 0.1117557 0.4470227 0.6761636
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3124432 0.4356551 0.1353344 0.2362752 0.4706103 0.5322163
gmean
0.6287774
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")
print(rfobj) Sample size: 1520
Frequency of class labels: 747, 773
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 289.387
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 961
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.13528264
(OOB) Normalized Brier score: 0.54113056
(OOB) AUC: 0.94598402
(OOB) PR-AUC: 0.94457272
(OOB) G-mean: 0.87279673
(OOB) Requested performance error: 0.12720327
Confusion matrix:
predicted
observed 0 Yes class.error
0 632 115 0.1539
Yes 77 696 0.0996
(OOB) Misclassification rate: 0.1263158
print(rfobj) Sample size: 1520
Frequency of class labels: 747, 773
Number of trees: 1000
Forest terminal node size: 1
Average no. of terminal nodes: 289.387
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 961
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.13528264
(OOB) Normalized Brier score: 0.54113056
(OOB) AUC: 0.94598402
(OOB) PR-AUC: 0.94457272
(OOB) G-mean: 0.87279673
(OOB) Requested performance error: 0.12720327
Confusion matrix:
predicted
observed 0 Yes class.error
0 632 115 0.1539
Yes 77 696 0.0996
(OOB) Misclassification rate: 0.1263158
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Ethnicity 0.0279 NA NA
EnglishSpeak 0.0202 NA NA
Discrimination 0.0178 NA NA
Age 0.0163 NA NA
Community.Shares.Values 0.0141 NA NA
Family.Pride 0.0132 NA NA
Religion 0.0131 NA NA
EnglishDiff 0.0127 NA NA
Religious.Importance 0.0125 NA NA
See.Friends 0.0103 NA NA
Helpful.Community 0.0094 NA NA
Religious.Attendance 0.0093 NA NA
Helpful.Family 0.0091 NA NA
Community.Trust 0.0083 NA NA
Feel.Close 0.0082 NA NA
Close.Friends 0.0076 NA NA
Get.Along 0.0075 NA NA
Gender 0.0057 NA NA
Close.knit.Community 0.0047 NA NA
Family.Respect 0.0036 NA NA
Health.Insurance 0.0032 NA NA
Helpful.Friends 0.0030 NA NA
Trust 0.0026 NA NA
Togetherness 0.0025 NA NA
Spend.Time.Together 0.0022 NA NA
Dental.Insurance 0.0019 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.0279362812 NA NA
Age 0.0162905709 NA NA
Gender 0.0057173663 NA NA
Religion 0.0131471592 NA NA
Employment -0.0009241508 NA NA
Income_median -0.0009241508 NA NA
EnglishSpeak 0.0202120121 NA NA
EnglishDiff 0.0126768900 NA NA
See.Family 0.0000000000 NA NA
Close.Family 0.0001905046 NA NA
Helpful.Family 0.0090983789 NA NA
See.Friends 0.0102992711 NA NA
Close.Friends 0.0076098662 NA NA
Helpful.Friends 0.0030271957 NA NA
Family.Respect 0.0035550006 NA NA
Similar.Values 0.0009211578 NA NA
Successful.Family 0.0018234059 NA NA
Trust 0.0025723455 NA NA
Loyalty 0.0018234059 NA NA
Family.Pride 0.0132408410 NA NA
Expression 0.0004485458 NA NA
Spend.Time.Together 0.0021893411 NA NA
Feel.Close 0.0082416201 NA NA
Togetherness 0.0025116505 NA NA
Religious.Attendance 0.0093179662 NA NA
Religious.Importance 0.0125077768 NA NA
Close.knit.Community 0.0047312659 NA NA
Helpful.Community 0.0093620909 NA NA
Community.Shares.Values 0.0141419161 NA NA
Get.Along 0.0075057862 NA NA
Community.Trust 0.0082957061 NA NA
Health.Insurance 0.0032004397 NA NA
Dental.Insurance 0.0019447044 NA NA
Discrimination 0.0177739413 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)
ggsave(filename="Folk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
328.0000000 51.0000000 6.4313725 0.1345646 0.5686275 0.5823171
prec npv misclass brier brier.norm auc
0.1746988 0.8967136 0.4195251 0.1160561 0.4642243 0.5974414
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2672811 0.3877765 0.1345646 0.1643881 0.4213563 0.4816040
gmean
0.5754316
test_rf$importance all 0 Yes
Ethnicity 6.012949e-03 NA NA
Age 2.759423e-02 NA NA
Gender 2.741019e-04 NA NA
Religion -1.157462e-03 NA NA
Employment 6.041262e-03 NA NA
Income_median 2.603965e-03 NA NA
EnglishSpeak 1.060551e-02 NA NA
EnglishDiff 3.417156e-03 NA NA
See.Family -2.539399e-03 NA NA
Close.Family -7.752321e-04 NA NA
Helpful.Family -6.497141e-03 NA NA
See.Friends -1.467569e-04 NA NA
Close.Friends 4.107855e-03 NA NA
Helpful.Friends -1.311790e-03 NA NA
Family.Respect 3.383756e-05 NA NA
Similar.Values 1.622703e-03 NA NA
Successful.Family -6.023257e-04 NA NA
Trust -3.814687e-03 NA NA
Loyalty -2.942095e-03 NA NA
Family.Pride -9.649487e-04 NA NA
Expression -8.642067e-04 NA NA
Spend.Time.Together 2.216063e-03 NA NA
Feel.Close 2.218099e-03 NA NA
Togetherness -7.018356e-04 NA NA
Religious.Attendance 2.815081e-04 NA NA
Religious.Importance -2.434106e-03 NA NA
Close.knit.Community -2.458493e-04 NA NA
Helpful.Community -1.205382e-03 NA NA
Community.Shares.Values -1.880313e-03 NA NA
Get.Along -1.524397e-03 NA NA
Community.Trust -1.668409e-03 NA NA
Health.Insurance -4.654616e-04 NA NA
Dental.Insurance -2.823929e-03 NA NA
Discrimination -1.283090e-04 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|>
mutate(fill = case_when(variable=="Ethnicity"~"red",
.default="black"))
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw() +
scale_fill_manual(values=c("black","red"),
guide="none")
importance_plot
ggsave(filename="Folk_test_VIMP.png",width=5,height=5,units="in")